require(stats)
require(graphics)
#library("ggplot2")
require(ggplot2)
#library("grDevices")
require(grDevices)
#library("psych")
require(psych)
#library(gridExtra)
require("gridExtra")
require(grid)
#library("reshape2")
require(reshape2)
#library(plyr)
require(plyr)
require(visreg)
#install.packages("ggthemes")
require(ggthemes)
library(car)
#install.packages("devtools")
library(devtools)
#install_github("easyGgplot2", "kassambara")
library(easyGgplot2)

# Please make sure to change the path to where the appropriate dataset is located on your computer


sc.fam10 <-  read.csv("PNAS_FAM_G_K1.csv") 
sc.ls10<-  read.csv("PNAS_LS_G_K1.csv") 

lne1 <- read.csv("census_wages.csv")

sponsor.require <- 1
high.skilled.p <-  2
ls.p <-  3
p.tourist.visa <- 4
p.fakedocs <-  5
no.econ.stud <- 6
p.all.illegal <- 7
k <- 8
my.seed <- 9

legaltourist.attempts <- c(10:29)

ability.legal.cols <- c(30:90)
ability.fakedocs.cols <- c(91:151)
ability.work.cols <- c(152:212)

legal.cols <- c(274:334)
fi.cols <- c(335:395)
sl.cols <- c(396:456)
stud.mig <- c(457:517)
hs.mig <- c(518:578)
ls.mig <- c(579:639)
fam.mig <- c(640:700)

num.agents <-  1166 
aspire.enough <- 507
time.point <- 20

table(sc.ls10[,ls.p])
table(sc.fam10[,sponsor.require])

# Make all settings equal in length

unique.lsp <- unique(sc.ls10[,ls.p])
unique.sr <- unique(sc.fam10[,sponsor.require])

# Create indices of 1000 first rows for each unique value of policy

curr.ind <-  NA
red.ind.ls10 <- vector(length = (length(unique.lsp) * 1000))
red.ind.fam10 <- vector(length = (length(unique.sr) * 1000))

for (i in 1:length(unique.lsp))
{
  curr.ind <- which(sc.ls10[,ls.p] == unique.lsp[i])
  red.ind.ls10[(((i - 1) * 1000) + 1): (((i - 1) * 1000) + 1000)] <- curr.ind[1:1000]
}

for (i in 1:length(unique.sr))
{
  curr.ind <- which(sc.fam10[,sponsor.require] == unique.sr[i])
  red.ind.fam10[(((i - 1) * 1000) + 1): (((i - 1) * 1000) + 1000)] <- curr.ind[1:1000]
}

# Create smaller dataset that has only the rows indexed
sc.ls10.red <- sc.ls10[red.ind.ls10,]
sc.fam10.red <- sc.fam10[red.ind.fam10,]

table(sc.ls10.red[,ls.p])
table(sc.fam10.red[,sponsor.require])

# Add two unauthorized migration modes

ill.mig.fam <- sc.fam10.red[,fi.cols] + sc.fam10.red[,sl.cols] 
ill.mig.ls <- sc.ls10.red[,fi.cols] + sc.ls10.red[,sl.cols]

# add them to the reduced dataset
sc.fam10.red <- cbind(sc.fam10.red,ill.mig.fam)
sc.ls10.red <- cbind(sc.ls10.red,ill.mig.ls)

# select the columns
ill.mig.cols <- c(701:761)


# FAMILY

# Create percentages of aspiring migrants for legal, illegal and non-migrant categories.
leg.fam.prop <- (sc.fam10.red[,legal.cols] / aspire.enough) * 100
illeg.fam.prop <- (sc.fam10.red[,ill.mig.cols] / aspire.enough) * 100
nm.fam.prop <- 100 - (leg.fam.prop + illeg.fam.prop) 

# Add to dataset
sc.fam10.red <- cbind(sc.fam10.red,leg.fam.prop,illeg.fam.prop,nm.fam.prop)

# Select columns
leg.prop.cols <- c(762:822)
illeg.prop.cols <- c(823:883)
nm.prop.cols <- c(884:944)

# FAMILY: Subract outcomes from the baseline values

leg.minus.base <- rep_len(0, nrow(sc.fam10.red))
illeg.minus.base <- rep_len(0, nrow(sc.fam10.red))
nm.minus.base <- rep_len(0, nrow(sc.fam10.red))

#For baseline, sponsor.require = 0. We have exactly 1000 runs per every level of sponsorship requirement. 
#We select that chunk of data and subtract the baseline chunk from it.

for (i in 1:length(unique.sr)){
  leg.minus.base[((1000 * (i - 1)) + 1):(1000 * i)] <- sc.fam10.red[((1000 * (i - 1)) + 1):(1000 * i), leg.prop.cols[time.point]] - sc.fam10.red[which(sc.fam10.red[,sponsor.require] == 0),leg.prop.cols[time.point]]
  illeg.minus.base[((1000 * (i - 1)) + 1):(1000 * i)] <- sc.fam10.red[((1000 * (i - 1)) + 1):(1000 * i), illeg.prop.cols[time.point]] - sc.fam10.red[which(sc.fam10.red[,sponsor.require] == 0),illeg.prop.cols[time.point]]
  nm.minus.base[((1000 * (i - 1)) + 1):(1000 * i)] <- sc.fam10.red[((1000 * (i - 1)) + 1):(1000 * i), nm.prop.cols[time.point]] - sc.fam10.red[which(sc.fam10.red[,sponsor.require] == 0),nm.prop.cols[time.point]]
}

x.fam <- sc.fam10.red[,sponsor.require]

#Aggregation

data1.fam <- as.data.frame(cbind(x.fam,leg.minus.base,illeg.minus.base, nm.minus.base))
# Get means, error, across repetitions per category
ag1.fam <- aggregate(. ~ x.fam, data1.fam, function(x) c(mean = mean(x), percentile = quantile(x, c(0.0275, 0.975))))
# Remove baseline setting minus baseline setting (gives 0)
ag1.fam <- ag1.fam[- which(ag1.fam$x.fam == 0),]

# Make new dataset with aggregated values, which will have a mean, lb and ub for each level of policy

fam.df <- as.data.frame(cbind(ag1.fam$x.fam, ag1.fam$leg.minus.base[,1], ag1.fam$leg.minus.base[,2], ag1.fam$leg.minus.base[,3],
                              ag1.fam$illeg.minus.base[,1], ag1.fam$illeg.minus.base[,2], ag1.fam$illeg.minus.base[,3], 
                              ag1.fam$nm.minus.base[,1], ag1.fam$nm.minus.base[,2], ag1.fam$nm.minus.base[,3]))


names(fam.df) <-  c("restrict","legal","legal.lb","legal.ub","illegal","illegal.lb","illegal.ub","nm","nm.lb", "nm.ub")


## LS

#percentage of aspiring migrants

leg.ls.prop <- (sc.ls10.red[,legal.cols] / aspire.enough) * 100
illeg.ls.prop <- (sc.ls10.red[,ill.mig.cols] / aspire.enough) * 100
nm.ls.prop <- 100 - (leg.ls.prop + illeg.ls.prop) 

sc.ls10.red <- cbind(sc.ls10.red,leg.ls.prop,illeg.ls.prop,nm.ls.prop)

leg.minus.base.ls <- rep_len(0, nrow(sc.ls10.red))
illeg.minus.base.ls <- rep_len(0, nrow(sc.ls10.red))
nm.minus.base.ls <- rep_len(0, nrow(sc.ls10.red))

# Subtract from baseline in same way

for (i in 1:length(unique.lsp)){
  
  leg.minus.base.ls[((1000 * (i - 1)) + 1):(1000 * i)] <- sc.ls10.red[((1000 * (i - 1)) + 1):(1000 * i), leg.prop.cols[time.point]] - sc.ls10.red[which(sc.ls10.red[,ls.p] == 1),leg.prop.cols[time.point]]
  illeg.minus.base.ls[((1000 * (i - 1)) + 1):(1000 * i)] <- sc.ls10.red[((1000 * (i - 1)) + 1):(1000 * i), illeg.prop.cols[time.point]] - sc.ls10.red[which(sc.ls10.red[,ls.p] == 1),illeg.prop.cols[time.point]]
  nm.minus.base.ls[((1000 * (i - 1)) + 1):(1000 * i)] <- sc.ls10.red[((1000 * (i - 1)) + 1):(1000 * i), nm.prop.cols[time.point]] - sc.ls10.red[which(sc.ls10.red[,ls.p] == 1),nm.prop.cols[time.point]]
}

x.ls <- (sc.ls10.red[,ls.p])

# 2.7 and 97.5 percentile 

data1.ls <- as.data.frame(cbind(x.ls,leg.minus.base.ls,illeg.minus.base.ls,nm.minus.base.ls))

ag1.ls <- aggregate(. ~ x.ls, data1.ls, function(x) c(mean = mean(x), percentile = quantile(x, c(0.0275, 0.975))))

# Remove baseline setting minus baseline setting (gives 0)
ag1.ls <- ag1.ls[- which(ag1.ls$x.ls == 1),]

ls.df <- as.data.frame(cbind(ag1.ls$x.ls, ag1.ls$leg.minus.base.ls[,1], ag1.ls$leg.minus.base.ls[,2], ag1.ls$leg.minus.base.ls[,3],
                             ag1.ls$illeg.minus.base.ls[,1], ag1.ls$illeg.minus.base.ls[,2], ag1.ls$illeg.minus.base.ls[,3],
                             ag1.ls$nm.minus.base.ls[,1],ag1.ls$nm.minus.base.ls[,2],ag1.ls$nm.minus.base.ls[,3]))


names(ls.df) <-  c("p.ls","legal","legal.lb","legal.ub","illegal","illegal.lb","illegal.ub","nm","nm.lb", "nm.ub")

ls.df$p.ls.rev <- 1 - ls.df$p.ls

ls.df <- ls.df[order(ls.df$p.ls.rev),]

## Wilcoxon Rank Sum Tests

# Family

uniq.xfam <- sort(unique(data1.fam$x.fam))
kwp.fam <- rep(999,length(uniq.xfam))

# Is there a significant difference between legal and illegal values?

for (i in 1: length(kwp.fam))
{
  t <- wilcox.test(data1.fam$illeg.minus.base[which(data1.fam$x.fam == uniq.xfam[i])], 
                   data1.fam$leg.minus.base[which(data1.fam$x.fam == uniq.xfam[i])], 
                   paired = F, alternative = "greater")
  
  ifelse (t[3] <= 0.05, kwp.fam[i] <- uniq.xfam[i], kwp.fam[i] <- 8888)
}

kwp.fam

kwp.fam[which(kwp.fam == 8888)] <- NA

kwp.fam

# First value is NA and can't be plotted, so extract non-NA policy values

kwp.fam.red <- kwp.fam[2:21]

kwp.fam.red
fam.df$restrict

### Low Skilled 

data1.ls$x.ls.rev <-  1 - data1.ls$x.ls

data1.ls.rev <- data1.ls[order(data1.ls$x.ls.rev),]

uniq.xls <- sort(unique(data1.ls.rev$x.ls.rev))
kwp.ls <- rep(999,length(uniq.xls))

for (i in 1: length(kwp.ls))
{
  t <- wilcox.test(data1.ls.rev$illeg.minus.base[which(data1.ls.rev$x.ls.rev == uniq.xls[i])], 
                   data1.ls.rev$leg.minus.base[which(data1.ls.rev$x.ls.rev == uniq.xls[i])], 
                   paired = F, alternative = "greater")
  
  ifelse (t[3] <= 0.05, kwp.ls[i] <- uniq.xls[i], kwp.ls[i] <- 8888)
}

kwp.ls

kwp.ls[which(kwp.ls == 8888)] <- NA

kwp.ls


# Wilcoxon rank tests to compare unauthorized to the baseline. 

sc.ls10.red[illeg.prop.cols[20]]
sc.ls10.red$p.ls.rev <- 1 - sc.ls10.red$low.skilled.p

baseline.ls <- sc.ls10.red[which(sc.ls10.red$p.ls.rev == 0),illeg.prop.cols[20]]
bwp.ls <- rep(999,length(uniq.xls))

for (i in 1:length(uniq.xls))
{
  
  w <- wilcox.test(sc.ls10.red[which(sc.ls10.red$p.ls.rev == uniq.xls[i]),illeg.prop.cols[20]], baseline.ls, paired = F, alternative = "greater")
  
  ifelse (w[3] > 0.05, bwp.ls[i] <- uniq.xls[i], bwp.ls[i] <- 8888)
}

bwp.ls
bwp.ls[which(bwp.ls == 8888)] <- NA
bwp.ls

# Legal from baseline

# Wilcoxon rank tests to compare unauthorized to the baseline. 

sc.ls10.red[leg.prop.cols[20]]

baseline.ls.leg <- sc.ls10.red[which(sc.ls10.red$p.ls.rev == 0),leg.prop.cols[20]]
bwp.ls.leg <- rep(999,length(uniq.xls))

for (i in 1:length(uniq.xls))
{
  
  w <- wilcox.test(sc.ls10.red[which(sc.ls10.red$p.ls.rev == uniq.xls[i]),leg.prop.cols[20]], baseline.ls.leg, paired = F, alternative = "less")
  
  ifelse (w[3] > 0.05, bwp.ls.leg[i] <- uniq.xls[i], bwp.ls.leg[i] <- 8888)
}

bwp.ls.leg
bwp.ls.leg[which(bwp.ls.leg == 8888)] <- NA
bwp.ls.leg



# Family -- difference from baseline, illegal

baseline.fam <- sc.fam10.red[which(sc.fam10.red$sponsor.require == 0),illeg.prop.cols[20]]

bwp.fam <- rep(999,length(uniq.xfam))

for (i in 1: length(bwp.fam))
{
  w <- wilcox.test(sc.fam10.red[which(sc.fam10.red$sponsor.require == uniq.xfam[i]),illeg.prop.cols[20]], baseline.fam, paired = F, alternative = "greater")
  
  ifelse (w[3] > 0.05, bwp.fam[i] <- uniq.xfam[i], bwp.fam[i] <- 8888)
}

bwp.fam

bwp.fam[which(bwp.fam == 8888)] <- NA

bwp.fam


# Family -- difference from baseline, illegal

baseline.fam.leg <- sc.fam10.red[which(sc.fam10.red$sponsor.require == 0),leg.prop.cols[20]]

bwp.fam.leg <- rep(999,length(uniq.xfam))

for (i in 1: length(bwp.fam.leg))
{
  w <- wilcox.test(sc.fam10.red[which(sc.fam10.red$sponsor.require == uniq.xfam[i]),leg.prop.cols[20]], baseline.fam.leg, paired = F, alternative = "less")
  
  ifelse (w[3] > 0.05, bwp.fam.leg[i] <- uniq.xfam[i], bwp.fam.leg[i] <- 8888)
}

bwp.fam.leg

bwp.fam.leg[which(bwp.fam.leg == 8888)] <- NA

bwp.fam.leg


# Select non-NA results for plotting

kwp.ls.red <- kwp.ls[2:length(kwp.ls)] 
kwp.ls.red

bwp.ls.red <- bwp.ls[2:length(bwp.ls)] 
bwp.ls.red



### Line Plots

# Setup wages for FAM proxy
options(scipen=10)
lo <- qplot(YRSUSA1,INCWAGE,data=lne1) + stat_smooth(aes(outfit=fit<<-..y..))
lo + ylab("Yearly Wage in Previous Calendar Year (USD)") + xlab("Years Since Migration to US")

fit

pick2s <- seq(from = 3, to = 41, by = 4) # Get fit values for graph ticks
# Check values
pick2s - 1

wage.ysm <-  fit[pick2s]
wage.ysm <- format(round(wage.ysm, digits = 0),big.mark=",",scientific=FALSE, trim = T)
wage.ysm.t <-  sub(",.*$","",wage.ysm)

plot(fit, type = "l", xlab = "Yearly Wage in Previous Calendar Year, USD", ylab = "", 
     las = 1)
mtext("Loess Fit", side = 2, line = 6)

# Setup ratio illegal:legal for plotting

rat.ill <- round(ls.df$illegal / abs(ls.df$legal),2)
rat.ill.fam <- round(fam.df$illegal / abs(fam.df$legal), 2)


## Plots

baseline <- data.frame( x = c(-Inf, Inf), y = 0, baseline = factor(1) )
#grob <- grobTree(textGrob("Baseline", x=1.05,  y=0.56, hjust=0, rot = 90,
 #                         gp=gpar(col="black", fontsize=13, fontface="italic")))

#grob2 <- grobTree(textGrob("% Aspiring Migrants", x=-0.2,  y=0.3, hjust=0, rot = 90,
 #                          gp=gpar(col="black", fontsize=15, fontface="bold")))


lay <- rbind(1,2)
#Fig 3

dev.off()

q <- ggplot(data = ls.df, xlim = c(0,1)) +
  geom_line(aes(x=p.ls.rev, y=legal,  color = "Legal", linetype = "Legal"), size = 1) +
  geom_ribbon(aes(x=p.ls.rev, ymin=legal.lb, ymax = legal.ub), fill="dodgerblue", alpha = 0.3) +
  geom_line(aes(x=p.ls.rev, y=illegal, color = "Unauth.", linetype = "Unauth."), size = 1) +
  geom_ribbon(aes(x=p.ls.rev, ymin=illegal.lb, ymax = illegal.ub), fill="firebrick1", alpha = 0.3) +
  geom_line(aes( x, y, color= "Baseline", linetype = "Baseline"), baseline, size = 1) +
  geom_rug(aes(bwp.ls.red), col = "gray30", size = 1.2) +
  scale_color_manual(name = '', values=c('Baseline' = 'gray40','Legal'='cadetblue','Unauth.'='firebrick1')) +
  scale_linetype_manual(name = '', values=c('Baseline'=3,'Legal'=1,'Unauth.'=1)) +
  scale_y_continuous("% Aspiring Migrants") + xlab("Pr(Failure), Low-Skilled Visa") +
  scale_x_continuous(breaks= seq(0,1,0.1))

q <- q + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
                            axis.text=element_text(size=12), 
                            axis.title=element_text(size=14), 
                            legend.position = "top", legend.text=element_text(size=12)) +
  theme(plot.margin=unit(c(0.2,2,0.5,1.4),"cm")) + theme(legend.key.size =  unit(0.5, "in"))

q

h <- ggplot(xlim = c(0,1)) +
  geom_line(aes(x=ls.df$p.ls.rev, y=rat.ill), size = 1, linetype = 1) +
  geom_rug(aes(bwp.ls.red), col = "gray30", size = 1.2) +
  scale_y_continuous("Unauth.:Legal") + xlab("Pr(Failure), Low-Skilled Visa") + 
  scale_x_continuous(breaks= seq(0,1,0.1))
h <- h + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                            axis.text=element_text(size=12), 
                            axis.title=element_text(size=14)) + 
  theme(plot.margin=unit(c(0.2,2,0.5,1.4),"cm"))

pdf("Fig3.pdf",width=5,height=6)
grid.arrange(q,h, layout_matrix = lay, heights = c(3,0.7))

dev.off()



#Fig 4

dev.off()

g <- ggplot(data = fam.df) +
  geom_line(aes(x=restrict, y=legal,  color = "Legal", linetype = "Legal"), size = 1) +
  geom_ribbon(aes(x=restrict, ymin=legal.lb, ymax = legal.ub), fill="dodgerblue", alpha = 0.3) +
  geom_line(aes(x=restrict, y=illegal, color = "Unauth.", linetype = "Unauth."), size = 1) +
  geom_line(aes( x, y, color= "Baseline", linetype = "Baseline"), baseline, size = 1) +
  geom_ribbon(aes(x=restrict, ymin=illegal.lb, ymax = illegal.ub), fill="firebrick1", alpha = 0.3) +
  scale_color_manual(name = '', values=c('Baseline' = 'gray40','Legal'='cadetblue','Unauth.'='firebrick1')) +
  scale_linetype_manual(name = '', values=c('Baseline' = 3,'Legal'=1,'Unauth.'=1)) +
  scale_y_continuous("% Aspiring Migrants") + xlab("Yrs. Abroad (Sponsor Requirement)") +
  theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
                     axis.text=element_text(size=12), 
                     axis.title=element_text(size=12), 
                     legend.position = "top", legend.text=element_text(size=12)) +
  theme(plot.margin=unit(c(0.2,2,0.5,1.4),"cm")) + theme(legend.key.size =  unit(0.5, "in")) +
  scale_x_continuous(limits = c(2,40), breaks= seq(2,40,4),
                     sec.axis = sec_axis(~ . * 1, breaks= seq(2,40,4), 
                                         labels = wage.ysm.t,
                                         name = "Yearly Wage (USD, Thousands)"))


h <- ggplot(xlim = c(0,1)) +
  geom_line(aes(x=fam.df$restrict, y=rat.ill.fam), size = 0.8, linetype = 1, col = "black") +
  scale_y_continuous("Unauth.:Legal") + xlab("Yrs. Abroad (Sponsor Requirement)") + 
  scale_x_continuous(breaks= seq(2,40,4))
h <- h + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                            axis.text=element_text(size=12), 
                            axis.title=element_text(size=14)) +
  theme(plot.margin=unit(c(0,2,0.1,1.2),"cm"))

pdf("Fig4.pdf",width=5,height=6)
grid.arrange(g,h, layout_matrix = lay, heights = c(3,0.7))
dev.off()

